Carga de paquetes

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************

Importación de datos

Descargar datos: datos

eukaryotes<-read_csv("data/eukaryotes.csv", col_names = T)
## Parsed with column specification:
## cols(
##   `#Organism Name` = col_character(),
##   `Organism Groups` = col_character(),
##   Strain = col_character(),
##   BioSample = col_character(),
##   BioProject = col_character(),
##   Assembly = col_character(),
##   Level = col_character(),
##   `Size(Mb)` = col_double(),
##   `GC%` = col_double(),
##   Replicons = col_character(),
##   WGS = col_character(),
##   Scaffolds = col_double(),
##   CDS = col_double(),
##   `Release Date` = col_datetime(format = ""),
##   `GenBank FTP` = col_character(),
##   `RefSeq FTP` = col_character()
## )

Exploración y limpieza de datos

eukaryotes %>% glimpse()
## Rows: 5,131
## Columns: 16
## $ `#Organism Name`  <chr> "Emiliania huxleyi CCMP1516", "Arabidopsis thaliana…
## $ `Organism Groups` <chr> "Eukaryota;Protists;Other Protists", "Eukaryota;Pla…
## $ Strain            <chr> "CCMP1516", NA, "A17", NA, NA, NA, NA, NA, NA, "B80…
## $ BioSample         <chr> "SAMN02744062", "SAMN03081427", "SAMN02299339", "SA…
## $ BioProject        <chr> "PRJNA77753", "PRJNA10719", "PRJNA10791", "PRJNA198…
## $ Assembly          <chr> "GCA_000372725.1", "GCA_000001735.1", "GCA_00021949…
## $ Level             <chr> "Scaffold", "Chromosome", "Chromosome", "Chromosome…
## $ `Size(Mb)`        <dbl> 167.67600, 119.66800, 412.92400, 978.97200, 823.786…
## $ `GC%`             <dbl> 64.5000, 36.0528, 34.0470, 35.1208, 35.7097, 0.0000…
## $ Replicons         <chr> NA, "chromosome 1:NC_003070.9/CP002684.1; chromosom…
## $ WGS               <chr> "AHAL01", NA, "APNO01", "ACUP02", "AEKE02", "FJWB02…
## $ Scaffolds         <dbl> 7795, 7, 2187, 1191, 3224, 72295, 58, 279439, 598, …
## $ CDS               <dbl> 38554, 48350, 57661, 71525, 36010, 0, 41070, 0, 584…
## $ `Release Date`    <dttm> 2013-04-19, 2001-08-13, 2011-08-12, 2010-01-05, 20…
## $ `GenBank FTP`     <chr> "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/372…
## $ `RefSeq FTP`      <chr> "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/372…

Selección de columnas de interés

eukaryotes %>% select(c("#Organism Name","Organism Groups","Strain","Size(Mb)","GC%","CDS")) -> eukaryotes_subset
  
eukaryotes_subset %>% head()
## # A tibble: 6 x 6
##   `#Organism Name`       `Organism Groups`        Strain  `Size(Mb)` `GC%`   CDS
##   <chr>                  <chr>                    <chr>        <dbl> <dbl> <dbl>
## 1 Emiliania huxleyi CCM… Eukaryota;Protists;Othe… CCMP15…       168.  64.5 38554
## 2 Arabidopsis thaliana   Eukaryota;Plants;Land P… <NA>          120.  36.1 48350
## 3 Medicago truncatula    Eukaryota;Plants;Land P… A17           413.  34.0 57661
## 4 Glycine max            Eukaryota;Plants;Land P… <NA>          979.  35.1 71525
## 5 Solanum lycopersicum   Eukaryota;Plants;Land P… <NA>          824.  35.7 36010
## 6 Hordeum vulgare        Eukaryota;Plants;Land P… <NA>         9789.   0       0

Renombrar columnas

eukaryotes_subset %>% rename(organism="#Organism Name",groups="Organism Groups",genome_size_mb="Size(Mb)",gc_percent="GC%") %>% rename_with(tolower) -> eukaryotes_subset

eukaryotes_subset %>% head()
## # A tibble: 6 x 6
##   organism           groups              strain  genome_size_mb gc_percent   cds
##   <chr>              <chr>               <chr>            <dbl>      <dbl> <dbl>
## 1 Emiliania huxleyi… Eukaryota;Protists… CCMP15…           168.       64.5 38554
## 2 Arabidopsis thali… Eukaryota;Plants;L… <NA>              120.       36.1 48350
## 3 Medicago truncatu… Eukaryota;Plants;L… A17               413.       34.0 57661
## 4 Glycine max        Eukaryota;Plants;L… <NA>              979.       35.1 71525
## 5 Solanum lycopersi… Eukaryota;Plants;L… <NA>              824.       35.7 36010
## 6 Hordeum vulgare    Eukaryota;Plants;L… <NA>             9789.        0       0

Separar columna “groups” en tres: “domain”, “kingdom”, “class”

eukaryotes_subset %>% separate(groups,c("domain","kingdom","class"),sep = ";") -> eukaryotes_subset

eukaryotes_subset %>% head()
## # A tibble: 6 x 8
##   organism       domain  kingdom  class   strain genome_size_mb gc_percent   cds
##   <chr>          <chr>   <chr>    <chr>   <chr>           <dbl>      <dbl> <dbl>
## 1 Emiliania hux… Eukary… Protists Other … CCMP1…           168.       64.5 38554
## 2 Arabidopsis t… Eukary… Plants   Land P… <NA>             120.       36.1 48350
## 3 Medicago trun… Eukary… Plants   Land P… A17              413.       34.0 57661
## 4 Glycine max    Eukary… Plants   Land P… <NA>             979.       35.1 71525
## 5 Solanum lycop… Eukary… Plants   Land P… <NA>             824.       35.7 36010
## 6 Hordeum vulga… Eukary… Plants   Land P… <NA>            9789.        0       0

Visualización de datos

¿Cuántos genomas hay por cada grupo?

eukaryotes_subset %>% count(kingdom) %>% arrange(desc(n))
## # A tibble: 5 x 2
##   kingdom      n
##   <chr>    <int>
## 1 Fungi     2844
## 2 Animals   1250
## 3 Protists   539
## 4 Plants     478
## 5 Other       20

Histograma contando el número de obseraciones para una variable:

eukaryotes_subset %>% ggplot(aes(x=kingdom)) + geom_histogram(stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Podemos hacer la misma gráfica a partir de dos variables (una categórica y otra con valores núméricos):

eukaryotes_subset %>% group_by(kingdom) %>% count() %>% ggplot(aes(x=kingdom,y=n)) + 
  geom_col()

#reordenar
#Colores (fill, scale_fill_brewer(palette = "Set1"))
#Añadir capas (p.ej.  coord_flip())
#Transparencia  
#etiquetas de los ejes, labs(), (símbolos, ej.: expression(paste("Etiqueta ",mu)))
# Modificar tema: theme_cowplot()

Explorar tamaños de genoma:

eukaryotes_subset %>% group_by(kingdom) %>% summarise(mean=mean(genome_size_mb),sd=sd(genome_size_mb),max=max(genome_size_mb),min=min(genome_size_mb))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 5
##   kingdom    mean     sd    max    min
##   <chr>     <dbl>  <dbl>  <dbl>  <dbl>
## 1 Animals  1132.  1434.  32394.  0.663
## 2 Fungi      29.9   20.2   216.  2.08 
## 3 Other     108.   129.    543. 12.1  
## 4 Plants   1023.  2738.  27603.  0.986
## 5 Protists   46.7   57.1   808.  1.45

Gráfica de puntos

eukaryotes_subset %>% ggplot(.,aes(x=kingdom,y=genome_size_mb)) +
  geom_point()

Boxplot

eukaryotes_subset %>% ggplot(.,aes(x=kingdom,y=genome_size_mb)) +
  geom_point() + geom_boxplot()

Violin plot

eukaryotes_subset %>% ggplot(.,aes(x=kingdom,y=genome_size_mb)) + geom_violin()

# Zoom, Fungi
# Ejemplo de etiquetas para el eje y: scale_y_continuous(breaks = seq(0,250,by=30))
#Añadir observaciones + geom_jitter(width=0.15, alpha=0.3)

Graficar dos variables (CDS vs Genome size):

eukaryotes_subset %>% filter(genome_size_mb >0 & cds > 0) %>% ggplot(.,aes(x=cds,y=genome_size_mb,color=kingdom, size=cds)) +
  geom_point(alpha=0.5) -> p
p

### Hacer versión interactiva

#library(plotly)
#library(cowplot)
options(scipen = 999) #Opción para inhabilitar el uso de notación científica (se usará para mostrar los valores decimales en las etiquetas)
eukaryotes_subset %>% ggplot(.,aes(x=kingdom,y=genome_size_mb,color=kingdom)) +
  geom_point(alpha=0.7) + geom_boxplot() -> p

ggplotly(p) 
# Simplificar etiquetas: mutate(genome_size_mb=round(genome_size_mb,2))
# Añadir etiquetas a la estética general: text= paste("organism:",organism,"\ncds:",cds)
#Añadir tema: +theme_cowplot()
# Elegir campos para presentar en los recuadros: tooltip = c("text","x","y")
# Modificar menú de gráfica Plotly %>% config(displayModeBar = F) 
# Modificar fondo de la etiqueta: %>% layout(hoverlabel=list(bgcolor="white"))